{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_dataset(dataset_path):\n",
    "    \"\"\"\n",
    "    加载测试集，并计算其中的 tokens 总数。\n",
    "\n",
    "    :param dataset_path: 测试集的文件路径。\n",
    "    :return: 测试数据集及其 tokens 总数。\n",
    "    \"\"\"\n",
    "    with open(dataset_path, 'r', encoding='utf-8') as file:\n",
    "        dataset = [json.loads(line) for line in file.readlines()]\n",
    "    \n",
    "    # 计算 tokens 总数\n",
    "    total_tokens = sum(len(item['messages'][0]['content']) + len(item['messages'][1]['content']) for item in dataset)*8\n",
    "\n",
    "    return dataset, total_tokens\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [],
   "source": [
    "def memory_usage(model_parameters, batch_size, data_type_size=1, overhead_factor=1.5, num_gpus=1):\n",
    "    \"\"\"\n",
    "    估算训练过程中的显存使用量（以GB为单位）。\n",
    "\n",
    "    :param model_parameters: 模型的参数总量。\n",
    "    :param batch_size: 训练过程中的batch大小。\n",
    "    :param data_type_size: 数据类型的大小（以字节为单位），对于32位浮点数通常为4。\n",
    "    :param overhead_factor: 额外开销因子。\n",
    "    :param num_gpus: 使用的 GPU 数量。\n",
    "    :return: 显存使用量（GB）。\n",
    "    \"\"\"\n",
    "    return (model_parameters * data_type_size / 5e10) * (1 + overhead_factor) * batch_size / num_gpus\n",
    "\n",
    "def evaluation_duration(num_tokens, model_parameters, gpu_flops, gpu_utilization=0.3, num_gpus=1):\n",
    "    \"\"\"\n",
    "    估算整个评估过程的持续时间（以天为单位）。\n",
    "\n",
    "    :param num_tokens: 训练过程中使用的tokens总数。\n",
    "    :param model_parameters: 模型的参数总量。\n",
    "    :param gpu_flops: GPU的峰值性能。\n",
    "    :param gpu_utilization: GPU利用率。\n",
    "    :param num_gpus: 使用的 GPU 数量。\n",
    "    :return: 评估持续时间（天）。\n",
    "    \"\"\"\n",
    "    seconds_per_day = 86400\n",
    "    flops_per_token = 8\n",
    "    return (num_tokens * model_parameters * flops_per_token) / (gpu_flops * gpu_utilization * seconds_per_day * num_gpus)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "测试集中的tokens总数：209712\n",
      "预估显存使用量：52.0 GB per GPU\n",
      "预估评估持续时间：0.01 天，即 19.42 分钟\n"
     ]
    }
   ],
   "source": [
    "# 加载数据集并计算 tokens 总数\n",
    "dataset_path = \"../mnt/datasets/test.jsonl\"\n",
    "dataset, total_tokens = load_dataset(dataset_path)\n",
    "print(f\"测试集中的tokens总数：{total_tokens}\")\n",
    "\n",
    "# 设置模型和训练参数\n",
    "model_parameters = 130e9  # 130亿参数\n",
    "batch_size = 8\n",
    "gpu_flops = 624e12  # 312 TFLOPS\n",
    "num_gpus = 1\n",
    "\n",
    "# 计算显存使用和评估持续时间\n",
    "memory = memory_usage(model_parameters, batch_size, num_gpus=num_gpus)\n",
    "duration = evaluation_duration(total_tokens, model_parameters, gpu_flops, num_gpus=num_gpus)\n",
    "\n",
    "print(f\"预估显存使用量：{memory} GB per GPU\")\n",
    "# duration转为分钟\n",
    "print(f\"预估评估持续时间：{duration:.2f} 天，即 {duration * 24 * 60:.2f} 分钟\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "tim",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
